******************************************************************************************
* Do-file name:	an_reg_first_stage.do                                                    
* Task:         run first stage regression and create figure                             
* Last change:  04.04.2023                                                               
* Notes:                                                                                 
/*
This file contains the code used to generate the results presented in the following tables and figures:
- Figure A.3: czech_dist_bin_2g.pdf

- Table A.2:
		-- tab_first_stage.txt
*/
******************************************************************************************



******************************************************************************************
*** program setup
******************************************************************************************

version 14.2
clear all
macro drop _all
set linesize 90
set more off
* set trace on
discard
set seed 123456789
*set matsize 2000



******************************************************************************************
*** load working dataset
******************************************************************************************

use "data/work_first_stage_temp.dta", clear



******************************************************************************************
*** calculate czech share and other variables for first stage regression
******************************************************************************************

keep ao_gem_imp ao_kreis_imp year border_imp control_imp czech native emp_tot emp_nat imp_edu_2_low imp_edu_2_high imp_edu_3_low imp_edu_3_med imp_edu_3_high
drop if ao_kreis_imp == .	// drop the obs created by fillin
reshape wide czech native emp_tot emp_nat imp_edu_2_low imp_edu_2_high imp_edu_3_low imp_edu_3_med imp_edu_3_high, i(ao_gem_imp) j(year) 

*** generate czechshares: base period is employment 1990
gen czechshare_91_90  			= (czech1991 - czech1990) / emp_tot1990  
gen czechshare_92_90 			= (czech1992 - czech1990) / emp_tot1990  

label var czechshare_91_90 			"delta czech share (1991-1990)/employment (1990)"
label var czechshare_92_90 			"delta czech share (1992-1990)/employment (1990)"

label var imp_edu_2_low1990			"Share low educated in 1990 (based on imp_edu)"
label var imp_edu_2_high1990 		"Share high educated in 1990 (based on imp_edu)"
label var emp_tot1990			    "Number of employed individuals in 1990"

save "data/collapse_ao_gem_wide.dta", replace



******************************************************************************************
*** merge czech share to working dataset
******************************************************************************************

use "data/work_first_stage_temp.dta", clear

sort ao_gem_imp year
merge m:1 ao_gem_imp using "data/collapse_ao_gem_wide.dta", keepusing(ao_gem_imp czechshare_*)
drop if _merge == 2 		
drop _merge

save "data/work_first_stage.dta", replace



******************************************************************************************
*** create figures showing the relationship between distance and czech share in border & control regions
******************************************************************************************

*** binscatter plots

gen 	ln_distance = ln(distance)
gen 	fig_group = 1  if border_imp_13 == 1
replace fig_group = 2  if control_imp   == 1 


** by group (border vs control)		
binscatter  czechshare_92_90 ln_distance [aweight=emp_tot*weight_matching] if year == 1990 & (border_imp_13 == 1 | control_imp == 1), by(fig_group) ///
			nquantiles(45) line(qfit) reportreg ///
			ytitle(∆ Czech share (1992-1990), size(small)) ylabel(, labsize(small)) ymtick(##2) ///
			xtitle(Ln Distance (air-line) to nearest border crossing (in km), size(small)) ///
			xlabel(2.3 "10" 3.2 "25" 4.1 "60" 5 "150" 6 "400", labsize(small)) xmtick(2.75 "" 3.65 "" 4.55 "" 5.5 "") ///
			msymbols(smcircle smdiamond) mcolors(black gs11) ///
			legend(on order(1 "∆ Czech share (border regions)" 2 "∆ Czech share (control regions)" 3 "Fitted values (border regions)" 4 "Fitted values (control regions)") size(small)) ///
			graphregion(color(white))
graph export "figures/czech_dist_bin_2g.pdf", as(pdf) replace
	

** both groups together
// run binscatter command to get x-y-coordinates for the bins, use them in scatter plot and mark treatment and control groups differntly, bin 20 used as cut between treatment and
// control group, two bin groups at the border between treatment and control group are shared (i.e. include both tratment and control municipalities), therefore cut is not exact.
binscatter  czechshare_92_90 ln_distance [aweight=emp_tot*weight_matching] if year == 1990 & (border_imp_13 == 1 | control_imp == 1), ///
			genxq(bins) nquantiles(45) line(qfit) reportreg ///
			ytitle(∆ Czech share (1992-1990), size(small)) ylabel(, labsize(small)) ymtick(##2) ///
			xtitle(Ln Distance (air-line) to nearest border crossing (in km), size(small)) ///
			xlabel(2.3 "10" 3.2 "25" 4.1 "60" 5 "150" 6 "400", labsize(small)) xmtick(2.75 "" 3.65 "" 4.55 "" 5.5 "") ///
			msymbols(smcircle) mcolors(black) ///
			legend(on order(1 "∆ Czech share (1992-1990)" 2 "Fitted values") size(small)) ///
			graphregion(color(white))
graph export "figures/czech_dist_bin_1g.pdf", as(pdf) replace



******************************************************************************************
*** estimate first stage: municipality (gemeinde) level, total shock
******************************************************************************************

*** rescale distance variable from 1 km to 100 km to have larger coefficients in the table 
gen double distance_100 = distance/100
label var  distance_100 "distance (in 100 km)"


*** generate distance variable which is only positive for border regions 
gen double bdistance	= border_imp*distance_100
gen double bdistance_sq	= border_imp*distance_100*distance_100
label var bdistance		"distance only for border regions >0, 0 otherwise (based on distance_100)"
label var bdistance_sq	"bdistance squared"


*** only 13 treatment districts and no control districts
reg czechshare_92_90 c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & border_imp_13 == 1, robust		
est store cz_92_90_13tdistr_noc
test c.distance_100  c.distance_100#c.distance_100
estadd scalar f_stat_dist = r(F)

predict czshare_92_90_predic_13 if border_imp_13 == 1
replace czshare_92_90_predic_13 = 0  if czshare_92_90_predic_13 < 0	

replace czshare_92_90_predic_13 = 0  if control_imp == 1
label var czshare_92_90_predic_13 "∆ czech share (1992-1990)/total employment (1990) (pred. 13t, all c = 0)"


********* bootstrap standard errors: 1992-1990 shock **********************************************
*** save 500 copies of first stage (and distribution of error terms used in each)
forval bs = 1/500	{
	preserve
	
	// code to restrict sample to relevant regions
	reg czechshare_92_90 c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & (border_imp_13 == 1 | control_imp == 1), robust 
	keep if e(sample)
	drop czshare_92_90_predic_13
	
	* wild bootstrap
	if `bs' == 0 {
	gen wild = 1 	// first observation = don't mess with residuals
	}
	if `bs' != 0 {
		di "bootstrap sample `bs'"
		bysort ao_kreis_imp: gen wildtmp = uniform()  if _n == 1	// every district gets a random number
		replace wildtmp = -1  if wildtmp <  0.5 & wildtmp != .
		replace wildtmp =  1  if wildtmp >= 0.5 & wildtmp != .
		bysort ao_kreis_imp: egen wild = max(wildtmp)
		drop wildtmp
	}
	reg czechshare_92_90 c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & border_imp_13 == 1, robust	
	
	* wild thing
	predict res if e(sample), res
	predict xb  if e(sample), xb
	gen y = xb + wild * res
	reg y c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & border_imp_13 == 1, robust
	drop res xb y
	
	predict czshare_92_90_predic_13 if border_imp_13 == 1
	replace czshare_92_90_predic_13 = 0  if czshare_92_90_predic_13 < 0	

	replace czshare_92_90_predic_13 = 0  if control_imp == 1
	label var czshare_92_90_predic_13 "∆ czech share (1992-1990)/total employment (1990) (pred. 13t, all c = 0)"
	
	* save ao_gem_imp predicted czech shock and "wild" variable
	keep ao_gem_imp  czshare_92_90_predic_13  wild
	bysort ao_gem_imp: keep if _n == 1
	sort ao_gem_imp
	compress
	save "data\bootstrap/bs_wild_first_92_`bs'.dta", replace
	restore
	}

***************************************************************************************************


*** only 13 treatment districts and all control districts: effect of distance only for border districts
reg czechshare_92_90 i.border_imp c.bdistance c.bdistance_sq [pweight=emp_tot*weight_matching]  if year == 1990 & (border_imp_13 == 1 | control_imp == 1), robust		
est store cz_92_90_13tdistr
test c.bdistance  c.bdistance_sq
estadd scalar f_stat_dist = r(F)

predict czshare_92_90_predic_13_2 if border_imp_13 == 1 | control_imp == 1
replace czshare_92_90_predic_13_2 = 0  if czshare_92_90_predic_13_2 < 0	
label var czshare_92_90_predic_13_2 "∆ czech share (1992-1990)/total employment (1990) (pred. 13t, all c)"



*** create table
#delimit ;
global estout_first_stage "cells(b(star fmt(%9.3f) vacant(-)) se(fmt(%9.3f) par)) 
rename(bdistance distance_100 bdistance_sq c.distance_100#c.distance_100)
order(distance_100 c.distance_100#c.distance_100 _cons)
stats(N N_clust r2_a f_stat_dist, labels("No. municipalities" "Cluster (no. districts)" "Adjusted R2" "F-statistic") layout(@ @ @ @) fmt(%9.0fc %9.0fc %9.3f %9.2f))
starlevels(* 0.1 ** 0.05 *** 0.01) varwidth(35)
varlabels(_cons "Constant" distance_100 "Distance (/100)" c.distance_100#c.distance_100 "Distance (/100) squared" 1.border_imp "Border region")
prehead(@title) posthead() postfoot(@note) nonumbers collabels(none) style(tab)";
#delimit cr

estout cz_92_90_13tdistr_noc  cz_92_90_13tdistr ///
using "tables/tab_first_stage.txt", $estout_first_stage replace ///
drop(0b.border_imp) mlabels("13 border districts" "13 border & all control districts") ///
title(Table: First Stage: The Inflow of Czech Commuters and Distance to Border) ///
note(Notes: The table reports the coefficients from the first stage regression. Robust standard errors are applied. ///
Data Source: German Social Security Records, border districts and matched control districts, 1990 and 1992.)



*** czechshare_91_90: only 13 treatment districts and no control districts
reg czechshare_91_90 c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & border_imp_13 == 1, robust		
est store cz_91_90_13tdistr_noc
test c.distance_100  c.distance_100#c.distance_100

predict czshare_91_90_predic_13 if border_imp_13 == 1
replace czshare_91_90_predic_13 = 0  if czshare_91_90_predic_13 < 0	

replace czshare_91_90_predic_13 = 0  if control_imp == 1
label var czshare_91_90_predic_13 "∆ czech share (1991-1990)/total employment (1990) (pred. 13t, all c = 0)"


********* bootstrap standard errors: 1991-1990 shock **********************************************
*** save 500 copies of first stage (and distribution of error terms used in each)
forval bs = 1/500 {
	preserve
	
	// code to restrict sample to relevant regions
	reg czechshare_91_90 c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & (border_imp_13 == 1 | control_imp == 1), robust 
	keep if e(sample)
	drop czshare_91_90_predic_13
	
	* wild bootstrap
	if `bs' == 0 {
	gen wild = 1 	// first observation = don't mess with residuals
	}
	if `bs' != 0 {
		di "bootstrap sample `bs'"
		bysort ao_kreis_imp: gen wildtmp = uniform()  if _n == 1	// every district gets a random number
		replace wildtmp = -1  if wildtmp <  0.5 & wildtmp != .
		replace wildtmp =  1  if wildtmp >= 0.5 & wildtmp != .
		bysort ao_kreis_imp: egen wild = max(wildtmp)
		drop wildtmp
	}
	reg czechshare_91_90 c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & border_imp_13 == 1, robust	
	
	* wild thing
	predict res if e(sample), res
	predict xb  if e(sample), xb
	gen y = xb + wild * res
	reg y c.distance_100##c.distance_100 [pweight=emp_tot*weight_matching]  if year == 1990 & border_imp_13 == 1, robust
	drop res xb y
	
	predict czshare_91_90_predic_13 if border_imp_13 == 1
	replace czshare_91_90_predic_13 = 0  if czshare_91_90_predic_13 < 0	

	replace czshare_91_90_predic_13 = 0  if control_imp == 1
	label var czshare_91_90_predic_13 "∆ czech share (1991-1990)/total employment (1990) (pred. 13t, all c = 0)"
	
	* save ao_gem_imp predicted czech shock and "wild" variable
	keep ao_gem_imp  czshare_91_90_predic_13  wild
	bysort ao_gem_imp: keep if _n == 1
	sort ao_gem_imp
	compress
	save "data\bootstrap/bs_wild_first_91_`bs'.dta", replace
	restore
	}

***************************************************************************************************



******************************************************************************************
*** match predicted czech share to estimation datasets
******************************************************************************************

*** save dataset
sort ao_gem_imp year
save "data/work_first_stage_h1.dta", replace		


** collapse to municipality level -> one obs for each municipality
collapse (mean) ao_kreis_imp border_imp border_imp_13 control_imp ost ktyp ///
				czechshare_* distance distance_100 bdistance bdistance_sq ///
				czshare_92_90_predic_13 czshare_92_90_predic_13_* czshare_91_90_predic_13, by (ao_gem_imp)	

compress
save "data/work_first_stage.dta", replace


*** match czech shares to estimation datasets
foreach x in employ_region_s1  employ_region_s2  ///
			 wage_region_s1     ///
			 work_indiv {
use "data/`x'.dta", clear
sort ao_gem_imp
merge m:1 ao_gem_imp using "data/work_first_stage.dta", ///
keepusing(ao_gem_imp border_imp_13 czechshare_* distance_100 bdistance bdistance_sq ///
czshare_92_90_predic_13 czshare_92_90_predic_13_* czshare_91_90_predic_13)
drop if _merge == 2
drop _merge		

compress
save "data/`x'.dta", replace
	}

*** match czech shares to estimation datasets: pseudo-panel
foreach x in wage_region_edu2  wage_region_edu3_age3  wage_region_sex2_edu3_age3 {
use "data\pseudo_panel/`x'.dta", clear
sort ao_gem_imp
merge m:1 ao_gem_imp using "data/work_first_stage.dta", ///
keepusing(ao_gem_imp border_imp_13 czechshare_* distance_100 bdistance bdistance_sq ///
czshare_92_90_predic_13 czshare_92_90_predic_13_* czshare_91_90_predic_13)
drop if _merge == 2
drop _merge		

compress
save "data\pseudo_panel/`x'.dta", replace
	} 


*erase "data\agg_level/work_first_stage_temp.dta"



******************************************************************************************
*** end
******************************************************************************************

exit


*========================================================================================*
Comments:
- unique identifier: ao_gem_imp
